TL;DR

Applying our zero-inflated model to the olfactory data.

Unnormalized data

We select only the cells that pass QC, color coded by the experimental time point. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it should not work other than on Davide’s computer.

load("Expt4c2b_filtdata.Rda")
load("E4c2b_slingshot_wsforkelly.RData")

To speed up the computations, I will focus on the top 1,000 most variable genes.

names(batch) <- colnames(counts)

counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]

vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]

PCA

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(1, 2))
detection_rate <- colSums(core>0)
coverage <- colSums(core)
colMerged <- cc_rev[clus.labels]

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")

fq <- EDASeq::betweenLaneNormalization(counts, which="full")

pcafq <- prcomp(t(log1p(fq)))
plot(pcafq$x, col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")

The first plot is the unnormalized data and the second plot is after full-quantile normalization, which is what Russell and Diya used for the paper.

They found that to fully explain the differences between clusters, we need three dimensions.

pairs(pcafq$x[,1:3], col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")

df <- data.frame(PC1=pcafq$x[,1], PC2=pcafq$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                        PC1        PC2 detection_rate    coverage
## PC1             1.00000000 -0.0432585      0.2643194 -0.02214266
## PC2            -0.04325850  1.0000000      0.6508453  0.19979644
## detection_rate  0.26431940  0.6508453      1.0000000  0.41994811
## coverage       -0.02214266  0.1997964      0.4199481  1.00000000

Even after full-quantile normalization, there is some residual correlation between PC2 and detection rate.

ZINB

print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 3)))
##    user  system elapsed 
## 221.201  37.048 108.063
pairs(zinb_Vall@W, col=colMerged, pch=19, main="ZINB")

qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)

df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], W3=zinb_Vall@W[,3], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage, quality=qcpca$x[,1])
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                         W1          W2           W3    gamma_mu
## W1              1.00000000 -0.16601378 -0.104938063  0.03575507
## W2             -0.16601378  1.00000000  0.040478697  0.01093061
## W3             -0.10493806  0.04047870  1.000000000 -0.17209802
## gamma_mu        0.03575507  0.01093061 -0.172098021  1.00000000
## gamma_pi        0.26329101  0.05547144 -0.018388095 -0.09512231
## detection_rate -0.34471857 -0.13187287 -0.023141527  0.22821064
## coverage        0.08169276 -0.04792555  0.008233665  0.90198032
## quality         0.10834219  0.08748443 -0.006226189 -0.55756193
##                   gamma_pi detection_rate     coverage      quality
## W1              0.26329101    -0.34471857  0.081692763  0.108342193
## W2              0.05547144    -0.13187287 -0.047925554  0.087484425
## W3             -0.01838809    -0.02314153  0.008233665 -0.006226189
## gamma_mu       -0.09512231     0.22821064  0.901980318 -0.557561930
## gamma_pi        1.00000000    -0.96579515 -0.318289834  0.298937557
## detection_rate -0.96579515     1.00000000  0.419948109 -0.347477558
## coverage       -0.31828983     0.41994811  1.000000000 -0.609447350
## quality         0.29893756    -0.34747756 -0.609447350  1.000000000

Accounting for quality

mod <- model.matrix(~qcpca$x[,1:5])
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3, X=mod)))
##    user  system elapsed 
## 380.032  40.190 172.770
pairs(zinb_3@W, col=colMerged, pch=19, main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_3@W[,1], W2=zinb_3@W[,2], W3=zinb_3@W[,3], gamma_mu = zinb_3@gamma_mu[1,], gamma_pi = zinb_3@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                          W1           W2           W3    gamma_mu
## W1              1.000000000  0.008567876  0.130462367  0.09893941
## W2              0.008567876  1.000000000  0.001914656  0.11134599
## W3              0.130462367  0.001914656  1.000000000 -0.03719696
## gamma_mu        0.098939405  0.111345989 -0.037196957  1.00000000
## gamma_pi       -0.072167019  0.058346532 -0.129945032 -0.28471602
## detection_rate  0.141119099 -0.130270900  0.153378775  0.48856596
## coverage        0.024521464  0.121797564  0.106339543  0.86995162
##                   gamma_pi detection_rate    coverage
## W1             -0.07216702      0.1411191  0.02452146
## W2              0.05834653     -0.1302709  0.12179756
## W3             -0.12994503      0.1533788  0.10633954
## gamma_mu       -0.28471602      0.4885660  0.86995162
## gamma_pi        1.00000000     -0.9405971 -0.25089634
## detection_rate -0.94059713      1.0000000  0.41994811
## coverage       -0.25089634      0.4199481  1.00000000

Four dimensions

print(system.time(zinb_4 <- zinbFit(core, ncores = 3, K = 4)))
##    user  system elapsed 
## 252.835  28.154 116.245
pairs(zinb_4@W, col=colMerged, pch=19, main="ZINB")